{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "5747e926",
   "metadata": {},
   "source": [
    "<a href=\"https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/data_connectors/WebPageDemo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30146ad2-f165-4f4b-ae07-fe6597a2964f",
   "metadata": {},
   "source": [
    "# Web Page Reader\n",
    "\n",
    "Demonstrates our web page reader."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c39063b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2315a154-f72d-4447-b1eb-cde9b66868cb",
   "metadata": {},
   "source": [
    "#### Using SimpleWebPageReader"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "bed9032b",
   "metadata": {},
   "source": [
    "If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76a5165b",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install llama-index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87bf7ecd-50cd-47da-9f0e-bc48d7ae45d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import SummaryIndex\n",
    "from llama_index.readers import SimpleWebPageReader\n",
    "from IPython.display import Markdown, display\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6de3929-51eb-4064-b4b6-c203bb6debc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: the html_to_text=True option requires html2text to be installed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "663403de-2e6e-4340-ab8f-8ee681bc06aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = SimpleWebPageReader(html_to_text=True).load_data(\n",
    "    [\"http://paulgraham.com/worked.html\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8cd183a-2423-4a3e-ad92-dfe89ed5454e",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26854cc3-af61-4910-ab6b-3bed6acfb447",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = SummaryIndex.from_documents(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cfdf87a-97cb-481f-ad51-be5bf8b5217f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set Logging to DEBUG for more detailed outputs\n",
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"What did the author do growing up?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7278d033-cae3-4ddf-96bd-75ea570ca53f",
   "metadata": {},
   "outputs": [],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2708dc99-0e4d-4c7e-b180-8392286d87c2",
   "metadata": {},
   "source": [
    "#### Using TrafilaturaWebReader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa2d54c6-c694-4852-a743-165e4777bd56",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.readers import TrafilaturaWebReader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46854f2f-426e-40a3-a87f-5fb51f90e14c",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = TrafilaturaWebReader().load_data(\n",
    "    [\"http://paulgraham.com/worked.html\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80752ad3-1ed8-4695-9247-22efbe475746",
   "metadata": {},
   "outputs": [],
   "source": [
    "index = SummaryIndex.from_documents(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cc9b154-1dcf-479b-b49b-251874aea506",
   "metadata": {},
   "outputs": [],
   "source": [
    "# set Logging to DEBUG for more detailed outputs\n",
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"What did the author do growing up?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "971b6415-8bcd-4d8b-a1de-9b7ada3cd392",
   "metadata": {},
   "outputs": [],
   "source": [
    "display(Markdown(f\"<b>{response}</b>\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2b6d07c",
   "metadata": {},
   "source": [
    "### Using RssReader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5ad5ca8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import SummaryIndex\n",
    "from llama_index.readers import RssReader\n",
    "\n",
    "documents = RssReader().load_data(\n",
    "    [\"https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml\"]\n",
    ")\n",
    "\n",
    "index = SummaryIndex.from_documents(documents)\n",
    "\n",
    "# set Logging to DEBUG for more detailed outputs\n",
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"What happened in the news today?\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  },
  "vscode": {
   "interpreter": {
    "hash": "c32397a35d2e76e766f80c3872b208f0c0029e8a6a9b8e2a8fe7b1641cfa009b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
